Group04 Presentation R4BDS

Eric Calaveras - Sol Escamilla - Beatriz García - Lucía López - Diego Medina

INTRODUCTION

Introduction


MATERIALS AND METHODS

Materials and methods

Data Wrangling: From messy to tidy data




borovecki <- borovecki$x |>
  as_tibble() |>
  mutate(
    sample = row_number(), 
    .before = 1, 
    diagnosis = borovecki$y
  )

borovecki <- borovecki |>
  pivot_longer(
    cols = ends_with("at"),
    names_to = "probeID", 
    values_to = "expressionValue",
    values_drop_na = TRUE
  )

Materials and methods

Data Wrangling: From messy to tidy data




borovecki <- borovecki |>
  mutate(
    gene = mapIds(
      hgu133a.db, 
      keys=probeID, 
      keytype = "PROBEID", 
      column="SYMBOL",
      multiVals="first"
    )
  ) |>
  filter(!is.na(gene)) |>
  dplyr::select(-probeID)

borovecki_geneLevel <- borovecki |>
  group_by(sample, gene) |>
  summarise(expression = mean(expression), .groups = "drop")

Materials and methods

Normalization: From raw to standardized data


raw_histogram <- ggplot(borovecki_data, aes(x = expressionValue,fill = "Raw data")) +
  geom_histogram(bins = 50, alpha = 0.6) +
  fill_scale+
  theme_minimal(base_size = 14) +
  labs(x = "Expression value",
       y = "Count") +
  theme(plot.title = element_text(size = 13, 
                                  hjust = 0.5)) 

log2_histogram <- ggplot(borovecki_data, aes(x = log2_expression,  fill = "Log2 data")) +
  geom_histogram(bins = 100, alpha = 0.6) +
  fill_scale +
  theme_minimal(base_size = 14) +
  labs(x = "log2(Expression value)",
       y = "Count") +
  theme(plot.title = element_text(size = 13,
                                  hjust = 0.5))
raw_histogram + log2_histogram + 
  plot_layout(guides = "collect")+
  plot_annotation(title="Comparison between Raw and Log2-Transformed Expression Data", 
                  caption="Data from: Borovecki et al. 2005") &
  theme(legend.position = "right")

Materials and methods

Linear Modeling


borovecki_tibble <- borovecki_tibble |>
  pivot_wider(
    # Columns not to pivot
    id_cols = c(sample, diagnosis),
    # Columns to pivot
    names_from = gene,
    values_from = expressionValue
  ) |>
  mutate(
    Diagnosis = case_when(
      diagnosis == "symptomatic" ~ 1,
      diagnosis == "control" ~ 0
    )
  ) |>
  dplyr::select(-c(diagnosis,sample)) |>
  relocate(Diagnosis, .before = 1)

borovecki_tibble_long <- borovecki_tibble  |> 
  pivot_longer(
    cols = -Diagnosis,
    names_to = "gene",
    values_to = "log2_expr_level"
  ) |>
  mutate(
    log2_expr_level = log2(log2_expr_level) 
  )
borovecki_tibble_long_nested <- borovecki_tibble_long |> 
  group_by(gene) |> 
  nest() |> 
  ungroup()

Materials and methods

Linear Modeling


by_gene <- borovecki_tibble_long_nested |> 
  group_by(gene)
linear_model <- function(df) {
  lm(
  log2_expr_level ~ Diagnosis , 
  data = df)
}
borovecki_tibble_long_nested  <- borovecki_tibble_long_nested  |>  
  mutate(model_object = map(data, linear_model))
tidy_data <- function(df) {
  df |>
  tidy(conf.int = TRUE,
       conf.level = 0.95)
  }
borovecki_estimates <- borovecki_tibble_long_nested |>
  mutate(model_object_tidy = map(model_object, tidy_data)) |>
  unnest(model_object_tidy) |>
  filter(term == "Diagnosis") |>
  dplyr::select(gene, p.value, estimate, conf.low, conf.high) |> 
  mutate(q.value=p.adjust(p.value),
          is_significant = case_when(
            q.value >= 0.05 ~ "no",
            q.value < 0.05 ~ "yes"
            )

RESULTS

Results

Dataset exploration in established genetic modifiers

Results

Forest Plot

Results

Results

Volcano Plot

DISCUSSION

Discussion


Bioinformatics is an essential tool in modern biological research.



Collaboration between scientists and bioinformaticians allows for a better understanding of how diseases work

Thank you